#!/usr/bin/env Rscript
# coding: utf-8

#===============================================================================
#  Tom Paskhalis, 2020-12-18
#
#  Replication script (analysis and write up) for 
#  Denisa Kostovicova, Tom Paskhalis. "Gender, Justice and Deliberation:
#  Why Women Don't Influence Peace-making" International Studies Quarterly
# 
#===============================================================================

library("readr")
library("dplyr")
library("tidyr")
library("knitr")
library("kableExtra")
library("quanteda")
library("rstan")
library("rstanarm")
library("bayesplot")
library("ggplot2")
library("cowplot")
library("stm")

# Read-in datasets
# Text
load("./data/dqiCorpus.RData")
load("./data/dqiCorpusLemma.RData")
stopwords_sr_lemmas <- readr::read_lines("./data/stopwords_sr_lemmas.txt")
stopwords_sr_lemmas <- stopwords_sr_lemmas[stopwords_sr_lemmas != ""]
# DQI/Consultations
coded <- readr::read_csv("./data/DQITJ1_Master.csv")
participants <- readr::read_csv("./data/participants.csv")


# Table 1 -----------------------------------------------------------------

tab1 <- participants %>%
  dplyr::select(-consultation) %>%
  dplyr::summarise(
    Presence.women = mean(women.discussants/participants),
    Presence.men = mean(men.discussants/participants),
    Speech.women = mean((w.speech + w.sas)/women.discussants, na.rm = TRUE),
    Speech.men = mean((m.speech + m.sas)/men.discussants),
    Argumentation.women = mean(w.sas/(w.speech + w.sas), na.rm = TRUE),
    Argumentation.men = mean(m.sas/(m.speech + m.sas))
  ) %>%
  tidyr::gather(param, value) %>%
  tidyr::separate(param, into = c("param", "gender")) %>%
  tidyr::spread(gender, value) %>%
  dplyr::mutate(women = paste0(round(women * 100, 1), "%"),
                men = paste0(round(men * 100, 1), "%")) %>%
  knitr::kable(col.names = c(" ", "\\textbf{Men}", "\\textbf{Women}"),
               format = "latex", escape = FALSE)

cat(tab1)


# Figure 1 ----------------------------------------------------------------

# dqi_components <- coded %>%
#   dplyr::select(
#     PART,
#     JUST_RAT,
#     CONT_COLL,
#     CONT_DIFF,
#     CONT_APR,
#     RES_PAR,
#     RES_GR,
#     STO_JUS
#   )
# 
# # Dichotomised
# dqi_components <- dqi_components %>%
#   dplyr::mutate(
#     JUST_RAT = ifelse(JUST_RAT == 0 | JUST_RAT == 1, 0, 1),
#     CONT_COLL = ifelse(CONT_COLL == 2, 0, 1),
#     RES_PAR = ifelse(RES_PAR == 0 | RES_PAR == 1, 0, 1),
#     RES_GR = ifelse(RES_GR == 0 | RES_GR == 1, 0, 1),
#     STO_JUS = ifelse(STO_JUS == 0 , 0, 1)
#   )
# 
# # Model definition
# stan.code <- "
# data {
#   int<lower=1> J; // number of speech acts(SAs)
#   int<lower=1> K; // number of DQI components
#   int<lower=1> N; // number of observations, N = J x K
#   int<lower=1,upper=J> jj[N]; // SA for observation n
#   int<lower=1,upper=K> kk[N]; // DQI component for observation n
#   int<lower=0,upper=1> y[N]; // indicator if SA j satisfies criterion k
# }
# parameters {
#   vector[J] alpha;
#   vector[K] gamma;
#   vector[K] beta;
#   real mu_beta;
#   real<lower=0.1> sigma_beta;
#   real<lower=0.1> sigma_gamma;
# }
# model {
#   alpha ~ normal(0, 1);
#   beta ~ normal(mu_beta, sigma_beta); // DQI components are given mean-centred parametrization
#   mu_beta ~ cauchy(0, 5);
#   sigma_beta ~ cauchy(0, 5);
#   gamma ~ lognormal(0, sigma_gamma);
#   sigma_gamma ~ cauchy(0, 5);
#   for (n in 1:N)
#     y[n] ~ bernoulli_logit(gamma[kk[n]] * (alpha[jj[n]] - beta[kk[n]]));
# }
# "
# 
# # Data preparation for Stan
# y <- c(as.matrix(dqi_components))
# J <- nrow(dqi_components)
# K <- ncol(dqi_components)
# N <- J * K
# jj <- rep(1:J, times = K)
# kk <- rep(1:K, each = J)
# 
# stan.data <- list(y = y, J = J, K = K, N = N, jj = jj, kk = kk)
# 
# alpha.init <- dqi_components %>%
#   apply(1, sum)
# alpha.init <- ifelse(alpha.init > mean(alpha.init), 1, -1)
# 
# stan.inits <- rep(list(list(alpha = alpha.init,
#                             beta = rnorm(K),
#                             mu_beta = 0,
#                             sigma_beta = 1,
#                             gamma = rlnorm(K),
#                             sigma_gamma = 1)), 2)
# 
# options(mc.cores = parallel::detectCores())
# # Model compilation
# stan.fit <- rstan::stan(
#   model_code = stan.code,
#   data = stan.data,
#   iter = 10000,
#   warmup = 1000,
#   chains = 3,
#   thin = 2,
#   init = stan.inits,
#   verbose = FALSE
# )

# save(stan.fit, "./results/dqi_irt_stan_fit.RData")

load("./results/dqi_irt_stan_fit.RData")

meta <- summary(dqiCorpusLemma, n = ndoc(dqiCorpusLemma))

DQI <- c(
  "PART",
  "JUST_RAT",
  "CONT_COLL",
  "CONT_DIFF",
  "CONT_APR",
  "RES_PAR",
  "RES_GR",
  "STO_JUS"
)

alpha.coeffs <- rstan::extract(stan.fit, pars = "alpha") %>%
  `[[`(., 1) %>%
  t() %>%
  as.data.frame() %>%
  apply(., 1, mean) %>%
  data.frame(stan.mean = .)

# Component difficulty and discrimination posterior estimates
gamma.coeffs <- rstan::extract(stan.fit, pars = "gamma") %>%
  `[[`(., 1) %>%
  as.data.frame() %>%
  setNames(., DQI) %>%
  tidyr::gather(key = component, value = param) %>%
  dplyr::group_by(component) %>%
  dplyr::summarise(mean.gamma = mean(param),
                   q025.gamma = quantile(param, probs = 0.025),
                   q975.gamma = quantile(param, probs = 0.975))

beta.coeffs <- rstan::extract(stan.fit, pars = "beta") %>%
  `[[`(., 1) %>%
  as.data.frame() %>%
  setNames(., DQI) %>%
  tidyr::gather(key = component, value = param) %>%
  dplyr::group_by(component) %>%
  dplyr::summarise(mean.beta = mean(param),
                   q025.beta = quantile(param, probs = 0.025),
                   q975.beta = quantile(param, probs = 0.975))

coeffs <- dplyr::left_join(beta.coeffs, gamma.coeffs, by = "component") %>%
  dplyr::arrange(component)

# Generate random speech act quality
quality <- sort(rnorm(n = 1000))

# Prepare gamma and beta coefficients
components <- coeffs %>%
  dplyr::select(component, mean.gamma, mean.beta) %>%
  tidyr::gather(param, value, mean.gamma:mean.beta) %>%
  tidyr::spread(component, value) %>%
  dplyr::select(-param) %>%
  as.list()

# Calculate predicted probabilities
predprob <- lapply(components, function(j) plogis(j[2] * (quality - j[1]))) %>%
  as.data.frame() %>%
  dplyr::bind_cols(quality = quality) %>%
  tidyr::gather(key = component, value = probability, -quality) %>%
  mutate(component = as.factor(component))

low_sa <- meta[which(grepl("Dalje, clan 8", texts(dqiCorpus))), "said"]
low_dqi <- round(alpha.coeffs[low_sa,], 2)
low_sa <- paste0("SA", low_sa)
low <- paste0(low_sa, " = ", low_dqi)

examples_1 <- tibble(x = c(1, 1),
                     y = c(0.55, -0.55),
                     txt = c("Dalje, član 8. - jako dobra odredba,\ni vi insistirajte na tome da to ostane,\nda su države ugovornice\ndužne sarađivati sa REKOM-om.",
                             "Further, Article 8 - a very good provision,\nand you insist on keeping it,\nso that state parties\nare obliged to cooperate with RECOM."))

fig1_1 <- ggplot(examples_1, aes(x, y, label = txt)) +
  geom_text(size = 1.8) +
  geom_hline(yintercept = 0) +
  ylim(-1,1) +
  labs(title = low,
       x = "",
       y = "") +
  theme(plot.title = element_text(size = rel(0.7), hjust = 0.5),
        panel.border = element_rect(colour = "black", fill = NA),
        panel.background = element_rect(fill = "white", colour = "black"),
        axis.text = element_blank(),
        axis.ticks = element_blank())

high_sa <- meta[which(grepl("Na konsultacijama u Mostaru", texts(dqiCorpus))), "said"]
high_dqi <- round(alpha.coeffs[high_sa,], 2)
high_sa <- paste0("SA", high_sa)
high <- paste0(high_sa, " = ", high_dqi)

examples_2 <- tibble(x = c(1, 1),
                     y = c(0.55, -0.55),
                     txt = c("Na konsultacijama u Mostaru\nje od predstavnika ljudi s Kosova bilo\nizričito zahtevano i se stave prisilni\n nestanci nakon oružanog sukoba.\nNaime, to je bio problem kod njih.\nA nije uslo u nacrt statuta.\nTako da svaka konsultacija\niznjedri nešto novo u statutu.\nI zato smatram da bi možda bilo dobro\nprihvatiti inicijativu\nšto je dosla iz Bosne i Hercegovine\nda se član 14.\nPromeni i da ide, kako rekla,\nsva kršenja međunarodnog prava\ni kršenja protiv čovečnosti. Hvala.",
                             "At the consultations in Mostar,\nthe representatives from Kosovo\nexplicitly requested that forced disappearances\nafter an armed conflict be also included.\nNamely, that was an issue there,\nbut it was not included into the draft Statute.\nThus, every consultation\nproduces something new in the Statute.\nAnd that's why I think it might be good\nto accept the initiative\nthat came from Bosnia and Herzegovina\nto change Article 14\nand for it to say\nall violations of international law\nand crimes against humanity. Thank you"))

fig1_3 <- ggplot(examples_2, aes(x, y, label = txt)) +
  geom_text(size = 1.8) +
  geom_hline(yintercept = 0) +
  ylim(-1,1) +
  labs(title = high,
       x = "",
       y = "") +
  theme(plot.title = element_text(size = rel(0.7), hjust = 0.5),
        panel.border = element_rect(colour = "black", fill = NA),
        panel.background = element_rect(fill = "white", colour = "black"),
        axis.text = element_blank(),
        axis.ticks = element_blank())

palette <- c("#000000", "#E69F00", "#56B4E9", "#009E73", "#F0E442", "#0072B2", "#D55E00", "#CC79A7")
# Colour and linetype version
fig1_2 <- ggplot(predprob, aes(x = quality, y = probability)) +
  # geom_ribbon(aes(ymin = q025, ymax = q975, fill = item), alpha = 0.1) +
  geom_line(aes(colour = component, linetype = component)) +
  geom_segment(aes(x = low_dqi, y = 0.05, xend = low_dqi, yend = 0),
               size = 0.2,
               arrow = arrow(length = unit(0.05, "cm"))) +
  geom_segment(aes(x = high_dqi, y = 0.05, xend = high_dqi, yend = 0),
               size = 0.2,
               arrow = arrow(length = unit(0.05, "cm"))) +
  geom_text(aes(x = low_dqi, y = 0.07, label = low_sa),
            size = 1.8,
            check_overlap = TRUE) +
  geom_text(aes(x = high_dqi, y = 0.07, label = high_sa),
            size = 1.8,
            check_overlap = TRUE) +
  coord_cartesian(xlim = c(-2.5, 2.5), ylim = c(-0.02, 1.02), expand = FALSE) +
  labs(x = "Discourse quality",
       y = "Probability of meeting a criterion",
       colour = "Component",
       linetype = "Component") +
  scale_colour_manual(values = palette,
                      labels = c("Content (abstract)",
                                 "Content (common good)",
                                 "Content (difference)",
                                 "Justification Rationality",
                                 "Interruption",
                                 "Respect (Groups)",
                                 "Respect (Participants)",
                                 "Story Justification"),
                      guide = guide_legend(title.position = "top")) +
  scale_linetype_discrete(labels = c("Content (abstract)",
                                     "Content (common good)",
                                     "Content (difference)",
                                     "Justification Rationality",
                                     "Interruption",
                                     "Respect (Groups)",
                                     "Respect (Participants)",
                                     "Story Justification"),
                          guide = guide_legend(title.position = "top")) +
  theme_bw() +
  theme(legend.position = "bottom",
        legend.box.margin = margin(0, 0, 0, 0),
        # legend.box.spacing = unit(0, "lines"),
        legend.key.size = unit(0.5,"cm"),
        legend.title.align = 0.5,
        legend.title = element_text(size=rel(0.6)),
        legend.text = element_text(size=rel(0.45)),
        axis.title.x = element_text(size = rel(0.7), colour = "black"),
        axis.title.y = element_text(size = rel(0.7), colour = "black"),
        axis.text.x = element_text(size = rel(0.5), colour = "black"),
        axis.text.y = element_text(size = rel(0.5), colour = "black"),
        axis.line = element_line(size = rel(0.3)),
        axis.ticks.x = element_line(size = rel(0.3)),
        axis.ticks.y = element_line(size = rel(0.3)))

# fig2_colour_linetype <- gridExtra::arrangeGrob(
#   fig2_1, fig2_5, fig2_3,
#   widths = c(1,2,1), ncol = 3, nrow = 1
# )

fig1_colour_linetype <- cowplot::plot_grid(
  fig1_1, fig1_2, fig1_3,
  rel_widths = c(1,2,1), ncol = 3
)

# EPS
ggsave(
  "./figures/figure_1.eps",
  plot = fig1_colour_linetype,
  device = cairo_ps,
  width = 220,
  height = 100,
  units = "mm"
)


# Table 2 -----------------------------------------------------------------

# meta <- summary(dqiCorpusLemma, n = quanteda::ndoc(dqiCorpusLemma))
# 
# meta <- meta %>%
#   # Leave only discussants
#   dplyr::filter(type == "DISC") %>%
#   dplyr::left_join((coded %>%
#                       dplyr::select(CON_NUM, CON_DIV, TRANS) %>%
#                       dplyr::group_by(CON_NUM) %>%
#                       dplyr::summarise(diversity = first(CON_DIV),
#                                        translation = first(TRANS)) %>%
#                       dplyr::left_join(participants %>%
#                                          mutate(perc_women_disc = (women.discussants/(women.discussants + men.discussants)) * 100) %>%
#                                          select(cid, perc_women_disc), c("CON_NUM" = "cid"))), by = c("cid" = "CON_NUM")) %>%
#   dplyr::mutate(diversity = factor(as.character(diversity), labels = c("MULTI", "DYAD", "MONO")),
#                 translation = factor(translation, labels = c("YES", "NO"))) %>%
#   # Group by speech, first instance should be identical to all the others
#   dplyr::group_by(uid) %>%
#   dplyr::summarise(tokens = sum(Tokens),
#                    type = first(type),
#                    gender = first(gender),
#                    cid = first(cid),
#                    diversity = first(diversity),
#                    level = first(level),
#                    community = first(community),
#                    translation = first(translation),
#                    perc_women_disc = first(perc_women_disc)) %>%
#   dplyr::mutate(gender = relevel(gender, ref = "MAL"),
#                 diversity = relevel(diversity, ref = "MONO"),
#                 translation = relevel(translation, ref = "NO"))
# 
# lmer.stan1 <- rstanarm::stan_lmer(log(tokens) ~ gender + (1|cid), data = meta)
# lmer.stan3 <- rstanarm::stan_lmer(log(tokens) ~ gender + perc_women_disc + diversity + level + community + translation + (1|cid), data = meta)

# saveRDS(lmer.stan1, "./results/participation_rstanarm_fit1.rds")
# saveRDS(lmer.stan2, "./results/participation_rstanarm_fit2.rds")

lmer.stan1 <- readRDS("./results/participation_rstanarm_fit1.rds")
lmer.stan3 <- readRDS("./results/participation_rstanarm_fit2.rds")

# unclass is required for correct handling of rownames
out.lmer.stan1 <- tibble::as_tibble(
  unclass(summary(lmer.stan1, probs = c(0.025, 0.975))), rownames = "pars"
) %>%
  dplyr::mutate_at(vars(mean, `2.5%`, `97.5%`), round, 3) %>%
  dplyr::mutate(se = paste0("(", `2.5%`, ", ", `97.5%`, ")")) %>%
  dplyr::select(pars, mean, se) %>%
  tidyr::gather(x, `(1)`, -pars) %>%
  dplyr::select(pars, x, `(1)`)

out.lmer.stan3 <- tibble::as_tibble(
  unclass(summary(lmer.stan3, probs = c(0.025, 0.975))), rownames = "pars"
) %>%
  dplyr::mutate_at(vars(mean, `2.5%`, `97.5%`), round, 3) %>%
  dplyr::mutate(se = paste0("(", `2.5%`, ", ", `97.5%`, ")")) %>%
  dplyr::select(pars, mean, se) %>%
  tidyr::gather(x, `(2)`, -pars) %>%
  dplyr::select(pars, x, `(2)`)

ordering <- c(
  "genderFEM",
  "perc_women_disc",
  "diversityDYAD",
  "diversityMULTI",
  "levelREG",
  "communityPRO",
  "communityVIC",
  "translationYES",
  "(Intercept)",
  "sigma",
  "Sigma[cid:(Intercept),(Intercept)]",
  "log-posterior"
)

labeling <- c(
  "Female",
  "\\% Female Discussants",
  "Dyadic",
  "Multi-Ethnic",
  "Regional",
  "Professionals",
  "Victims",
  "Yes",
  "Intercept",
  "$\\sigma_y$",
  "$\\sigma_{\\alpha}$",
  "log-posterior"
)

# Vector for using in if_else statement below for adding reference levels
reference <- rep(labeling, each = 3)
reference[seq(1,length(reference),3)] <- c(
  "Gender (ref: Male)",
  NA,
  "Diversity (ref: Mono-ethnic)",
  NA,
  "Level (ref: Non-regional)",
  "Type (ref: General)",
  NA,
  "Translation (ref: No)",
  NA,
  NA,
  NA,
  NA
)

tab2 <- out.lmer.stan1 %>%
  dplyr::full_join(out.lmer.stan3, by = c("pars", "x")) %>%
  dplyr::filter(!grepl("^b\\[\\(Intercept\\)|mean_PPD", pars)) %>%
  dplyr::mutate(pars = factor(pars, levels = ordering, labels = labeling)) %>%
  dplyr::mutate(`(1)` = if_else(is.na(`(1)`), "", `(1)`)) %>%
  dplyr::arrange(pars) %>%
  # Add reference levels of categorical variables
  dplyr::group_by(pars) %>%
  dplyr::do(add_row(., .before = 0)) %>%
  dplyr::ungroup() %>%
  dplyr::mutate(
    pars = if_else(is.na(pars), reference, as.character(pars)),
    x = if_else(is.na(x), "", x),
    `(1)` = if_else(is.na(`(1)`), "", `(1)`),
    `(2)` = if_else(is.na(`(2)`), "", `(2)`)) %>%
  dplyr::filter(!is.na(pars)) %>%
  # remove SE for log-posterior
  dplyr::filter(pars != "log-posterior" | x != "se") %>%
  dplyr::select(-x) %>%
  dplyr::bind_rows(tibble::tibble(
    pars = "Groups",
    `(1)` = as.character(attr(summary(lmer.stan1), "ngrps")),
    `(2)` = as.character(attr(summary(lmer.stan3), "ngrps")))
  ) %>%
  dplyr::bind_rows(tibble::tibble(
    pars = "Observations",
    `(1)` = as.character(attr(summary(lmer.stan1), "nobs")),
    `(2)` = as.character(attr(summary(lmer.stan3), "nobs")))
  ) %>%
  knitr::kable(
    col.names = c(" ", "(1)", "(2)"),
    format = "latex",
    booktabs = TRUE,
    align = "c",
    escape = FALSE
  ) %>%
  kableExtra::collapse_rows(columns = 1, latex_hline = "none") %>%
  kableExtra::add_header_above(header = c(" " = 1, "Log(words)" = 2))

cat(tab2)


# Table 3 -----------------------------------------------------------------

# # Merge with estimated DQI scores
# coded <- coded %>%
#   dplyr::mutate(
#     GEN = factor(GEN, labels = c("female", "male")),
#     SP_TYP = factor(SP_TYP, labels = c("unique", "repeat")),
#     ISS_POL = factor(as.character(ISS_POL), labels = c("high", "medium", "low")),
#     CON_DIV = factor(as.character(CON_DIV), labels = c("multi", "dyad", "mono")),
#     CON_LEV = factor(as.character(CON_LEV), labels = c("reg", "nonreg")),
#     CON_TYP = factor(as.character(CON_TYP), labels = c("vic", "gen", "prof")),
#     TRANS = factor(as.character(TRANS), labels = c("yes", "no")),
#     TEMP_JUST = factor(TEMP_JUST, labels = c("no", "past", "presfut")),
#     SUB_COL = factor(SUB_COL, labels = c("no", "coll", "ind")),
#     SUB_ETH = factor(SUB_ETH, labels = c("no", "eth", "non-eth", "multi-eth")),
#     STO_POS = factor(STO_POS, labels = c("no", "group", "others", "pers"))
#   ) %>%
#   # Combine levels of SUB_ETH and STO_POS
#   dplyr::mutate(
#     SUB_ETH = factor(ifelse(SUB_ETH == "no" | SUB_ETH == "non-eth", "no/non-eth", "eth/multi-eth")),
#     STO_POS = factor(ifelse(STO_POS == "group" | STO_POS == "others", "group/others", STO_POS))
#   ) %>%
#   # Fix factor labels
#   dplyr::mutate(STO_POS = factor(STO_POS, labels = c("no", "pers", "group/others"))) %>%
#   # Reorder levels
#   dplyr::mutate(
#     GEN = relevel(GEN, ref = "male"),
#     ISS_POL = factor(ISS_POL, levels = c("low", "medium", "high")),
#     CON_DIV = factor(CON_DIV, levels = c("mono", "dyad", "multi")),
#     CON_LEV = factor(CON_LEV, levels = c("nonreg", "reg")),
#     CON_TYP = factor(CON_TYP, levels = c("gen", "prof", "vic")),
#     TRANS = relevel(TRANS, ref = "no"),
#     SUB_ETH = relevel(SUB_ETH, ref = "no/non-eth")
#   ) %>%
#   dplyr::bind_cols(alpha.coeffs) %>%
#   dplyr::rename(DQI = stan.mean)
# 
# dqi.lmer1 <- rstanarm::stan_lmer(DQI ~ GEN + (1 | CON_NUM), data = coded)
# dqi.lmer2 <- rstanarm::stan_lmer(DQI ~ GEN + SP_TYP + ISS_POL + (1 | CON_NUM), data = coded)
# dqi.lmer3 <- rstanarm::stan_lmer(DQI ~ GEN + SP_TYP + ISS_POL + CON_DIV + CON_LEV + CON_TYP + TRANS + (1 | CON_NUM), data = coded)
# 
# saveRDS(dqi.lmer1, "./results/dqi_rstanarm_fit1.rds")
# saveRDS(dqi.lmer2, "./results/dqi_rstanarm_fit2.rds")
# saveRDS(dqi.lmer3, "./results/dqi_rstanarm_fit3.rds")

dqi.lmer1 <- readRDS("./results/dqi_rstanarm_fit1.rds")
dqi.lmer2 <- readRDS("./results/dqi_rstanarm_fit2.rds")
dqi.lmer3 <- readRDS("./results/dqi_rstanarm_fit3.rds")

# unclass is required for correct handling of rownames
out.dqi.lmer1 <- tibble::as_tibble(
  unclass(summary(dqi.lmer1, probs = c(0.025, 0.975))), rownames = "pars"
) %>%
  dplyr::mutate_at(vars(mean, `2.5%`, `97.5%`), round, 3) %>%
  dplyr::mutate(se = paste0("(", `2.5%`, ", ", `97.5%`, ")")) %>%
  dplyr::select(pars, mean, se) %>%
  tidyr::gather(x, `(1)`, -pars) %>%
  dplyr::select(pars, x, `(1)`)

out.dqi.lmer2 <- tibble::as_tibble(
  unclass(summary(dqi.lmer2, probs = c(0.025, 0.975))), rownames = "pars"
) %>%
  dplyr::mutate_at(vars(mean, `2.5%`, `97.5%`), round, 3) %>%
  dplyr::mutate(se = paste0("(", `2.5%`, ", ", `97.5%`, ")")) %>%
  dplyr::select(pars, mean, se) %>%
  tidyr::gather(x, `(2)`, -pars) %>%
  dplyr::select(pars, x, `(2)`)

out.dqi.lmer3 <- tibble::as_tibble(
  unclass(summary(dqi.lmer3, probs = c(0.025, 0.975))), rownames = "pars"
) %>%
  dplyr::mutate_at(vars(mean, `2.5%`, `97.5%`), round, 3) %>%
  dplyr::mutate(se = paste0("(", `2.5%`, ", ", `97.5%`, ")")) %>%
  dplyr::select(pars, mean, se) %>%
  tidyr::gather(x, `(3)`, -pars) %>%
  dplyr::select(pars, x, `(3)`)

ordering <- c(
  "GENfemale",
  "SP_TYPrepeat",
  "ISS_POLmedium",
  "ISS_POLhigh",
  "CON_DIVdyad",
  "CON_DIVmulti",
  "CON_LEVreg",
  "CON_TYPprof",
  "CON_TYPvic",
  "TRANSyes",
  "(Intercept)",
  "sigma",
  "Sigma[CON_NUM:(Intercept),(Intercept)]",
  "log-posterior"
)

labeling <- c(
  "Female",
  "Yes",
  "Medium",
  "High",
  "Dyadic",
  "Multi-Ethnic",
  "Regional",
  "Professionals",
  "Victims",
  "Translation",
  "Intercept",
  "$\\sigma_y$",
  "$\\sigma_{\\alpha}$",
  "log-posterior"
)

# Vector for using in if_else statement below for adding reference levels
reference <- rep(labeling, each = 3)
reference[seq(1,length(reference),3)] <- c(
  "Gender (ref: Male)",
  "Repeated Speaker (ref: No)",
  "Issue Polarization (ref: Low)",
  NA,
  "Diversity (ref: Mono-ethnic)",
  NA,
  "Level (ref: Non-regional)",
  "Type (ref: General)",
  NA,
  "Translation (ref: No)",
  NA,
  NA,
  NA,
  NA
)

tab3 <- out.dqi.lmer1 %>%
  dplyr::full_join(out.dqi.lmer2, by = c("pars", "x")) %>%
  dplyr::full_join(out.dqi.lmer3, by = c("pars", "x")) %>%
  dplyr::filter(!grepl("^b\\[\\(Intercept\\)|mean_PPD", pars)) %>%
  dplyr::mutate(pars = factor(pars, levels = ordering, labels = labeling)) %>%
  dplyr::mutate(`(1)` = if_else(is.na(`(1)`), "", `(1)`),
                `(2)` = if_else(is.na(`(2)`), "", `(2)`)) %>%
  dplyr::arrange(pars) %>%
  # Add reference levels of categorical variables
  dplyr::group_by(pars) %>%
  dplyr::do(add_row(., .before = 0)) %>%
  dplyr::ungroup() %>%
  dplyr::mutate(
    pars = if_else(is.na(pars), reference, as.character(pars)),
    x = if_else(is.na(x), "", x),
    `(1)` = if_else(is.na(`(1)`), "", `(1)`),
    `(2)` = if_else(is.na(`(2)`), "", `(2)`),
    `(3)` = if_else(is.na(`(3)`), "", `(3)`)
  ) %>%
  dplyr::filter(!is.na(pars)) %>%
  # remove SE for log-posterior
  dplyr::filter(pars != "log-posterior" | x != "se") %>%
  dplyr::select(-x) %>%
  dplyr::bind_rows(tibble::tibble(
    pars = "Groups",
    `(1)` = as.character(attr(summary(dqi.lmer1), "ngrps")),
    `(2)` = as.character(attr(summary(dqi.lmer2), "ngrps")),
    `(3)` = as.character(attr(summary(dqi.lmer3), "ngrps")))
  ) %>%
  dplyr::bind_rows(tibble::tibble(
    pars = "Observations",
    `(1)` = as.character(attr(summary(dqi.lmer1), "nobs")),
    `(2)` = as.character(attr(summary(dqi.lmer2), "nobs")),
    `(3)` = as.character(attr(summary(dqi.lmer3), "nobs")))
  ) %>%
  knitr::kable(
    col.names = c(" ", "(1)", "(2)", "(3)"),
    format = "latex",
    booktabs = TRUE,
    align = "c",
    escape = FALSE
  ) %>%
  kableExtra::collapse_rows(columns = 1, latex_hline = "none") %>%
  kableExtra::add_header_above(header = c(" " = 1, "DQI" = 3))

cat(tab3)


# Table 4 -----------------------------------------------------------------

# disc.meta <- meta %>%
#   # rle required atomic type
#   dplyr::mutate(gender = as.character(gender)) %>%
#   # assign sequence id
#   dplyr::mutate(seqid = with(rle(gender), rep(seq_along(lengths), lengths))) %>%
#   dplyr::group_by(seqid) %>%
#   dplyr::summarise(
#     duration = n(),
#     tokens = sum(tokens),
#     type = first(type),
#     gender = first(gender),
#     cid = first(cid),
#     diversity = first(diversity),
#     level = first(level),
#     community = first(community),
#     translation = first(translation),
#     perc_women_disc = first(perc_women_disc)
#   ) %>%
#   # arrange(gender, duration)
#   dplyr::mutate(
#     gender = factor(gender, levels = c("MAL", "FEM")),
#     diversity = relevel(diversity, ref = "MONO"),
#     translation = relevel(translation, ref = "NO")
#   )
# 
# poisson.stan1 <- rstanarm::stan_glmer(duration ~ gender + (1|cid), data = disc.meta, family = poisson(link = "log"))
# poisson.stan2 <- rstanarm::stan_glmer(duration ~ gender + perc_women_disc + diversity + level + community + translation + (1|cid), data = disc.meta, family = poisson(link = "log"))

# saveRDS(poisson.stan1, "./results/poisson_rstanarm_fit1.rds")
# saveRDS(poisson.stan2, "./results/poisson_rstanarm_fit2.rds")

poisson.stan1 <- readRDS("./results/poisson_rstanarm_fit1.rds")
poisson.stan2 <- readRDS("./results/poisson_rstanarm_fit2.rds")

ordering <- c(
  "genderFEM",
  "perc_women_disc",
  "diversityDYAD",
  "diversityMULTI",
  "levelREG",
  "communityPRO",
  "communityVIC",
  "translationYES",
  "(Intercept)",
  "Sigma[cid:(Intercept),(Intercept)]",
  "log-posterior"
)

labeling <- c(
  "Female",
  "\\% Female Discussants",
  "Dyadic",
  "Multi-Ethnic",
  "Regional",
  "Professionals",
  "Victims",
  "Yes",
  "Intercept",
  "$\\hat{\\sigma}_{\\alpha}$",
  "log-posterior"
)

# Vector for using in if_else statement below for adding reference levels
reference <- rep(labeling, each = 3)
reference[seq(1,length(reference),3)] <- c(
  "Gender (ref: Male)",
  NA,
  "Diversity (ref: Mono-ethnic)",
  NA,
  "Level (ref: Non-regional)",
  "Type (ref: General)",
  NA,
  "Translation (ref: No)",
  NA,
  NA,
  NA,
  NA
)

# unclass is required for correct handling of rownames
out.poisson1 <- tibble::as_tibble(
  unclass(summary(poisson.stan1, probs = c(0.025, 0.975))), rownames = "pars"
) %>%
  dplyr::mutate_at(vars(mean, `2.5%`, `97.5%`), round, 3) %>%
  dplyr::mutate(se = paste0("(", `2.5%`, ", ", `97.5%`, ")")) %>%
  dplyr::select(pars, mean, se) %>%
  tidyr::gather(x, `(1)`, -pars) %>%
  dplyr::select(pars, x, `(1)`)

out.poisson2 <- tibble::as_tibble(
  unclass(summary(poisson.stan2, probs = c(0.025, 0.975))), rownames = "pars"
) %>%
  dplyr::mutate_at(vars(mean, `2.5%`, `97.5%`), round, 3) %>%
  dplyr::mutate(se = paste0("(", `2.5%`, ", ", `97.5%`, ")")) %>%
  dplyr::select(pars, mean, se) %>%
  tidyr::gather(x, `(2)`, -pars) %>%
  dplyr::select(pars, x, `(2)`)

tab4 <- out.poisson1 %>%
  dplyr::full_join(out.poisson2, by = c("pars", "x")) %>%
  dplyr::filter(!grepl("^b\\[\\(Intercept\\)|mean_PPD", pars)) %>%
  dplyr::mutate(pars = factor(pars, levels = ordering, labels = labeling)) %>%
  dplyr::mutate(`(1)` = if_else(is.na(`(1)`), "", `(1)`),
                `(2)` = if_else(is.na(`(2)`), "", `(2)`)) %>%
  dplyr::arrange(pars) %>%
  # Add reference levels of categorical variables
  dplyr::group_by(pars) %>%
  dplyr::do(add_row(., .before = 0)) %>%
  dplyr::ungroup() %>%
  dplyr::mutate(
    pars = if_else(is.na(pars), reference, as.character(pars)),
    x = if_else(is.na(x), "", x),
    `(1)` = if_else(is.na(`(1)`), "", `(1)`),
    `(2)` = if_else(is.na(`(2)`), "", `(2)`)) %>%
  dplyr::filter(!is.na(pars)) %>%
  # remove SE for log-posterior
  dplyr::filter(pars != "log-posterior" | x != "se") %>%
  dplyr::select(-x) %>%
  dplyr::bind_rows(tibble::tibble(
    pars = "Groups",
    `(1)` = as.character(attr(summary(poisson.stan1), "ngrps")),
    `(2)` = as.character(attr(summary(poisson.stan2), "ngrps")))
  ) %>%
  dplyr::bind_rows(tibble::tibble(
    pars = "Observations",
    `(1)` = as.character(attr(summary(poisson.stan1), "nobs")),
    `(2)` = as.character(attr(summary(poisson.stan2), "nobs")))
  ) %>%
  knitr::kable(
    col.names = c(" ", "(1)", "(2)"),
    format = "latex",
    booktabs = TRUE,
    align = "c",
    escape = FALSE
  ) %>%
  kableExtra::collapse_rows(columns = 1, latex_hline = "none") %>%
  kableExtra::add_header_above(header = c(" " = 1, "Number of speeches in sequence" = 2))

cat(tab4)


# Figure 3 ----------------------------------------------------------------

# Prepare corpus
dqi_lemma_corpus <- quanteda::convert(dqiCorpusLemma, to = "data.frame") %>%
  dplyr::group_by(uid) %>%
  dplyr::summarise(
    text = paste(text, collapse = ""),
    dqi = first(dqi),
    region = first(region),
    level = first(level),
    community = first(community),
    cid = first(cid),
    id = first(id),
    sid = first(sid),
    type = first(type),
    gender = first(gender)
  ) %>%
  dplyr::filter(type == "DISC") %>%
  quanteda::corpus()

dqi_lemma_corpus <- dqi_lemma_corpus %>%
  quanteda::corpus_trim(what = "documents", min_ntoken = 10)

dqi_lemma_dfm <- quanteda::dfm(
  dqi_lemma_corpus,
  tolower = TRUE,
  stem = FALSE,
  remove_punct = TRUE
)

dqi_lemma_dfm <- dqi_lemma_dfm %>%
  quanteda::dfm_remove(stopwords_sr_lemmas, verbose = TRUE)

# stm_lemma_10 <- stm::stm(
#   dqi_lemma_dfm, 
#   seed = 700,
#   K = 10,
#   data = quanteda::docvars(dqi_lemma_dfm),
#   prevalence = ~ gender + region + level + community,
#   verbose = FALSE
# )

# saveRDS(stm_lemma_10, "./results/stm_lemma_10.rds")

stm_lemma_10 <- readRDS("./results/stm_lemma_10.rds")

stm.md <- stm::estimateEffect(
  1:10 ~ gender,
  stmobj = stm_lemma_10,
  metadata = quanteda::docvars(dqi_lemma_dfm)
)

# plot.stm <- plot(stm.md, "gender", cov.value1 = "MAL", cov.value2 = "FEM",  method = "difference", omit.plot = TRUE)
plot.stm <- plot(stm.md, "gender", cov.value1 = "MAL", cov.value2 = "FEM",  method = "difference")
plot.stm <- plot.stm %>%
  as_tibble() %>%
  mutate(
    topics = rev(topics),
    means = unlist(means),
    ci25 = unlist(lapply(cis, `[`, 1)),
    ci975 = unlist(lapply(cis, `[`, 2))
  ) %>%
  select(-cis)

fig_3 <- ggplot(plot.stm) +
  geom_vline(xintercept = 0, color = "gray90", size = 0.5) +
  geom_segment(aes(x = ci25, xend = ci975, y = topics, yend = topics)) +
  geom_point(aes(x = means, y = topics), colour = "red") +
  labs(title = "") +
  xlab("Gender coefficient (male vs female)") +
  scale_y_continuous("",
                     breaks = seq(10, 1, by = -1),
                     labels = c(
                       expression(Modality),
                       "Bosnia",
                       "Acknowledgement",
                       expression(Kosovo),
                       expression(Implementation),
                       "Implications",
                       expression(Outcome),
                       expression(Reconciliation),
                       "Evaluation",
                       "T10")) +
  theme_classic() +
  theme(
    plot.title = element_text(size = rel(0.6), hjust = 0.5),
    axis.title.x = element_text(size = rel(1), colour = "black"),
    axis.text.x = element_text(size = rel(0.8), colour = "black"),
    axis.text.y = element_text(size = rel(0.8), colour = "black"),
    axis.line = element_line(size = rel(0.5)),
    axis.ticks.x = element_line(size = rel(0.5)),
    axis.ticks.y = element_line(size = rel(0.5)),
    panel.grid.minor = element_blank(),
    panel.grid.major = element_blank()
  )

# EPS
ggsave(
  "./figures/figure_3.eps",
  plot = fig_3,
  device = cairo_ps,
  width = 160,
  height = 90,
  units = "mm"
)


# Figure 2 ----------------------------------------------------------------
# some objects from Figure 3 are re-used

logbeta <- stm_lemma_10$beta$logbeta
logbeta <- tibble::as_tibble(t(exp(logbeta[[1]])))
vocab <- tibble::tibble(word = stm_lemma_10$vocab)

ordering <- unname(unlist(Map(paste0, rep("V", 10), seq(1,10))))

topics <- logbeta %>%
  dplyr::bind_cols(vocab) %>%
  tidyr::gather(topic, prob, -word) %>%
  dplyr::mutate(topic = factor(topic, levels = ordering)) %>%
  dplyr::group_by(topic) %>%
  dplyr::top_n(10, prob) %>%
  dplyr::mutate(name = gsub("V", "T", topic)) %>%
  dplyr::arrange(topic, desc(prob), word) %>%
  # Change the original words into translated
  dplyr::mutate(
    word = case_when(
      topic == "V1" ~ c("think", "article", "criteria", "say", "tell", "state", "person", "president", "high", "know"),
      topic == "V2" ~ c("member", "bosnia", "herzegovina", "think", "year", "number", "commission", "two", "three", "state"),
      topic == "V3" ~ c("person", "year", "family", "say", "know", "truth", "tell", "come", "missing", "crime"),
      topic == "V4" ~ c("year", "kosovo", "know", "war", "victim", "say", "person", "crime", "tell", "speak"),
      topic == "V5" ~ c("article", "statute", "commission", "state", "think", "recom", "document", "inter-state", "manner", "proposal"),
      topic == "V6" ~ c("fact", "crime", "recom", "establish", "think", "commission", "court", "report", "war", "doubt"),
      topic == "V7" ~ c("recom", "think", "aim", "year", "war", "say", "conflict", "fact", "context", "commission"),
      topic == "V8" ~ c("victim", "person", "recom", "war", "think", "year", "question", "commission", "initiative", "regional"),
      topic == "V9" ~ c("commission", "think", "criminal", "court", "say", "statement", "state", "witness", "recom", "law"),
      topic == "V10" ~ c("right", "crime", "human", "victim", "war", "violation", "article", "commission", "grave", "think")
    )) %>%
  dplyr::mutate(x = 1,
                y = seq(n(), 1))

topic_nums <- c(
  V1 = "Topic 1",
  V2 = "Topic 2",
  V3 = "Topic 3",
  V4 = "Topic 4",
  V5 = "Topic 5",
  V6 = "Topic 6",
  V7 = "Topic 7",
  V8 = "Topic 8",
  V9 = "Topic 9",
  V10 = "Topic 10"
)

topic_labels <- c(
  T1 = "Modality",
  T2 = "Bosnia",
  T3 = "Acknowledgment",
  T4 = "Kosovo",
  T5 = "Implementation",
  T6 = "Implications",
  T7 = "Outcome",
  T8 = "Reconciliation",
  T9 = "Evaluation",
  T10 = "(mixture)"
)

fig_2 <- ggplot(topics, aes(x, y)) +
  geom_text(aes(label = word, size = prob), colour = "steelblue4") +
  scale_size(range = c(1,2.7)) +
  facet_grid(cols = vars(topic, name),
             labeller = labeller(topic = topic_nums,
                                 name = topic_labels)) +
  labs(x = "",
       y = "") +
  theme(plot.title = element_blank(),
        panel.border = element_blank(),
        panel.background = element_blank(),
        panel.spacing = unit(0.2, "lines"),
        strip.text = element_text(size = rel(0.42)),
        strip.background = element_rect(colour = "black", fill = "white"),
        axis.line = element_blank(),
        axis.text = element_blank(),
        axis.ticks = element_blank(),
        legend.position = "none")

# EPS
ggsave(
  "./figures/figure_2.eps",
  plot = fig_2,
  device = cairo_ps,
  width = 160,
  height = 90,
  units = "mm"
)
